import re
import csv
import codecs
import string
import datetime
import os
from bs4 import BeautifulSoup

def dehtml(text):
    """Strip HTML markup from *text* and return only its textual content.

    The parser is named explicitly: when none is given, BeautifulSoup picks
    whichever parser happens to be installed, so the same document can be
    parsed differently on different machines (and newer releases warn).
    ``html.parser`` ships with Python, so no extra dependency is needed.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


# Substrings to strip from a text, mapped to what replaces them.
unnecessary = {'\n': ''}


def removeunnecessary(text):
    """Return *text* with every key of ``unnecessary`` replaced by its value.

    All entries of the mapping are applied. If the mapping is empty the
    text is returned unchanged.
    """
    # .items() exists on both Python 2 and Python 3 dicts; .iteritems() is
    # Python 2 only.
    for old, new in unnecessary.items():
        text = text.replace(old, new)
    # Return only after the loop: returning inside it would stop after the
    # first replacement and yield None for an empty mapping.
    return text

# Input files: full paths of the regular files in the minutes folder.

folder = '/Users/bde254/Desktop/France-Senate-Minutes/'
# Names are sorted because os.listdir returns them in arbitrary order, which
# would make the order of results differ between runs. Entries that are not
# regular files (e.g. sub-directories) are skipped since they cannot be
# opened and read as text.
allfiles = [
    os.path.join(folder, entry)
    for entry in sorted(os.listdir(folder))
    if os.path.isfile(os.path.join(folder, entry))
]


# Read every file and reduce its HTML to plain text.
# Each entry of `texts` is {'name': <file name>, 'text': <text without markup>}.

texts = []

for path in allfiles:
    name = os.path.basename(path)
    # Progress indicator; the parenthesised form behaves the same on
    # Python 2 and Python 3 for a single argument.
    print(name)
    # The context manager closes the handle as soon as the file is read,
    # instead of leaving one open handle per input file.
    with codecs.open(path, "r", encoding="latin-1") as handle:
        htmltext = handle.read()
    texts.append({'name': name, 'text': dehtml(htmltext)})

# One record per document: its file name and the number of times the phrase
# 'mets aux voix' occurs in its text.
votecounts = [
    {
        'filename': text['name'],
        'votecount': len(re.findall('mets aux voix', text['text'])),
    }
    for text in texts
]

# Total number of phrase occurrences across all documents (0 when there are none).
totalcount = sum(record['votecount'] for record in votecounts)


